In [8]:
import dask.array as da

Import Dask's array module (dask.array), which provides a chunked, NumPy-like array.

Dask is a parallel computing library: it builds task graphs from operations on chunked collections and executes them with a scheduler that can run on threads, processes, or a distributed cluster.
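A minimal sketch of the execution model (illustrative values only): operations on a Dask collection just record tasks, and nothing runs until compute() is called.

import dask.array as da

# Building the expression only records a task graph; no data is generated yet.
lazy_total = da.arange(10, chunks=5).sum()

# compute() hands the graph to a scheduler (threaded by default) and returns 45.
lazy_total.compute()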


In [7]:
import numpy as np
x = np.arange(25)

In [3]:
x


Out[3]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24])

In [9]:
x = da.arange(25, chunks=(5,))

y = x ** 2

y

y.visualize()


Out[9]:
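Out[9] would render the task graph for y as an image (visualize() requires the graphviz package): one arange task and one power task per 5-element chunk. y itself is still lazy; materializing it gives the same values shown in Out[14] below.

# Execute the graph and pull the result back as a NumPy array:
y.compute()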

In [11]:
da.sqrt(x)[-1].visualize()

x = da.arange(250, chunks=(5,))

x.visualize()


Out[11]:
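Out[11] would show the graph for the 250-element x: one independent arange task per chunk. A quick way to confirm how chunks=(5,) split the array:

# Fifty blocks of five elements each:
x.numblocks   # (50,)
x.chunks      # ((5, 5, ..., 5),)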

In [13]:
x = da.ones((15, 15), chunks=(5,5))
x.sum(axis=1).visualize()


Out[13]:
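Out[13] would show the reduction graph: each 5x5 block is summed along axis 1 first, then the three partial sums in each block-row are added together. Since every row of a 15x15 array of ones sums to 15, the result is easy to check:

# Fifteen row sums, each equal to 15:
x.sum(axis=1).compute()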

In [14]:
import dask.multiprocessing

y.compute(get=dask.multiprocessing.get)


Out[14]:
array([  0,   1,   4,   9,  16,  25,  36,  49,  64,  81, 100, 121, 144,
       169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576])
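In [14] uses the legacy get= interface to run y's graph on the multiprocessing scheduler instead of the default thread pool. Newer Dask releases express the same choice with the scheduler= keyword; a hedged equivalent, depending on the installed version:

# Modern API: select the scheduler by name at compute time.
y.compute(scheduler='processes')   # multiprocessing pool
y.compute(scheduler='threads')     # default threaded scheduler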

In [15]:
import dask.dataframe as dd

In [16]:
cols = ['square_id', 'timestamp', 'country_code',
        'sms_in', 'sms_out', 'call_in', 'call_out', 'internet']

dtypes = {'square_id': int, 'timestamp': int, 'country_code': int,
          'sms_in': float, 'sms_out': float,
          'call_in': float, 'call_out': float, 'internet': float}

In [17]:
df = dd.read_csv?

In [ ]:
df = dd.read_csv('data/split/*.csv', header=0, names=cols, dtype=dtypes)

Missing separator argument! dd.read_csv defaults to comma-separated input, so this call does not split the tab-delimited file into the expected columns; the corrected call below passes sep="\t".


In [24]:
df_a = dd.read_csv('data/split/*.csv', header=0, names=cols, dtype=dtypes, sep="\t")
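Each file matched by the glob becomes one or more lazy partitions; nothing is read until a computation asks for data. A cheap sanity check on the parsing (head() only reads the first partition):

# Peek at the first few parsed rows and count the partitions:
df_a.head()
df_a.npartitions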

In [28]:
df_a.size


Out[28]:
dd.Scalar<size-ag..., dtype=int64>
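As with the arrays above, dataframe reductions are lazy: .size returns a dd.Scalar placeholder rather than a number. Computing it reads every partition and returns the total element count.

# Trigger the full read and aggregation:
df_a.size.compute()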